# -*- coding: utf-8 -*-

# Scrapy settings for article_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import os

# Absolute path of the directory that contains this settings module.
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))

# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'article_spider'

# Package that holds the project's spiders; it is both the module Scrapy
# searches for spiders and the place where newly generated spiders go.
NEWSPIDER_MODULE = 'article_spider.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'article_spider (+http://www.yourdomain.com)'

# robots.txt handling: when enabled, Scrapy follows each site's robots.txt
# policy. It is switched off for this project.
ROBOTSTXT_OBEY = False

# --- Concurrency -----------------------------------------------------------
# Maximum number of concurrent requests performed by the Scrapy downloader
# (default: 16).
CONCURRENT_REQUESTS = 8
# Maximum number of concurrent requests to any single website.
CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Maximum number of concurrent requests to any single IP. When non-zero,
# CONCURRENT_REQUESTS_PER_DOMAIN is ignored and this value is used instead,
# i.e. the limit applies per IP rather than per website. It also affects
# DOWNLOAD_DELAY: when non-zero, the delay is enforced per IP, not per website.
CONCURRENT_REQUESTS_PER_IP = 0
# Number of items processed concurrently.
CONCURRENT_ITEMS = 100

# --- Politeness ------------------------------------------------------------
# Delay applied between requests to the same website (default: 0).
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also the AutoThrottle settings and docs.
DOWNLOAD_DELAY = 3

# --- Crawl depth / DNS -----------------------------------------------------
# 0 means no depth limit and no depth-based priority adjustment.
DEPTH_LIMIT = 0
DEPTH_PRIORITY = 0
# Keep the DNS cache switched on.
DNSCACHE_ENABLED = True

# Cookies are disabled (Scrapy enables them by default), and cookie debug
# logging is left off as well.
COOKIES_ENABLED = COOKIES_DEBUG = False

# Disable Telnet Console (enabled by default)
# 禁用Telnet控制台（默认启用）
# TELNETCONSOLE_ENABLED = False

# Override the default request headers sent with every request.
#
# Accept-Encoding only advertises gzip and deflate. The previous value also
# listed "sdch", an encoding Scrapy's HttpCompressionMiddleware cannot decode;
# because this header is preset here, the middleware's own default is never
# applied, so a server honouring "sdch" would hand back an unreadable body.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'article_spider.middlewares.ArticleSpiderSpiderMiddleware': 543,
# }

# Downloader middlewares: each key is a middleware class path, each value its
# order; None switches a middleware off.
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # Project middlewares.
    'article_spider.contrib.downloadermiddleware.rotate_useragent.RotateUserAgentMiddleware': 300,
    'article_spider.contrib.downloadermiddleware.selenium_ware.SeleniumDownloadMiddleware': 301,
    # Project proxy middleware, currently left disabled:
    # 'article_spider.contrib.downloadermiddleware.rotate_proxy.ProxyMiddleWare': 300,

    # Scrapy's stock user-agent middleware is turned off.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Item pipelines enabled for this project (class path -> order).
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'article_spider.pipelines.mysql_twisted_pipeline.MysqlTwistedPipeline': 300,
}

# HTTP status codes that are allowed through (200 is allowed by default).
HTTPERROR_ALLOWED_CODES = [302, 403, 502]
# Retrying of requests is switched off.
RETRY_ENABLED = False

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay (previously assigned twice with the same value;
# a single assignment avoids the two copies drifting apart).
AUTOTHROTTLE_START_DELAY = 5
# How many responses should pass to perform concurrency adjustments.
# NOTE(review): this setting does not appear in the current AutoThrottle
# documentation and is presumably ignored by recent Scrapy releases; it is
# kept so nothing that still reads it breaks -- confirm before removing.
AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD = 10
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received
AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# 启用和配置HTTP缓存（默认情况下禁用）
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Logging: DEBUG level, written to a file given as a relative path.
LOG_LEVEL = 'DEBUG'
LOG_FILE = 'logs/scrapy.log'

# MySQL settings.
#
# The connection parameters can be overridden with environment variables of
# the same name, so real credentials do not have to be committed to source
# control. The literals below are only the fallbacks and match the previous
# hard-coded values, so behaviour is unchanged when no variable is set.
MYSQL_HOST = os.environ.get('MYSQL_HOST', 'localhost')
# Environment values arrive as strings; the port must stay an int.
MYSQL_PORT = int(os.environ.get('MYSQL_PORT', 3306))
MYSQL_USER = os.environ.get('MYSQL_USER', 'root')
MYSQL_PWD = os.environ.get('MYSQL_PWD', '123456')
MYSQL_DB = os.environ.get('MYSQL_DB', 'article')
# NOTE(review): presumably the connection-pool size and a timeout used by the
# MySQL pipeline -- confirm against the pipeline code.
MYSQL_MAX_COUNT = 30
MYSQL_TIMEOUT = 60

# Maximum page number when fetching further pages via AJAX.
# PS: for a first run it is best not to set this above 100 pages; for daily
# crawls it is best not to exceed 10 pages.
AJAX_MAX_PAGE_SIZE = 80

# Sites whose information is fetched via AJAX.
AJAX_DOMAINS = ['toutiao.com', 'tmtpost.com', 'zhihu.com', 'list.jd.com', 'music.163.com']

# Domains that use iframes.
IFRAME_DOMAINS = ['163.com']

# Path of the chromedriver file needed to run dynamic pages.
#
# The location is machine specific, so it can be supplied through the
# CHROME_DRIVER_FILE environment variable instead of editing this file. The
# Windows path below is the fallback and equals the previous hard-coded value
# (another path used before: /Users/weiwei/Documents/chromedriver).
CHROME_DRIVER_FILE = os.environ.get('CHROME_DRIVER_FILE', r"D:\projects\tools\chromedriver.exe")

# Pool of User-Agent strings.
# NOTE(review): presumably consumed by the project's RotateUserAgentMiddleware
# -- confirm against that middleware.
# Every entry is unique: the list previously contained four exact duplicates,
# which would have made those agents more likely to be picked than the rest.
USER_AGENTS = [
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
    'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
    'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
    'Mozilla/6.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1',
    'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:15.0) Gecko/20120910144328 Firefox/15.0.2',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9a3pre) Gecko/20070330',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.13; ) Gecko/20101203',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52',
    'Mozilla/5.0 (Windows; U; Win 9x 4.90; SG; rv:1.9.2.4) Gecko/20101104 Netscape/9.1.0285',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.1.7pre) Gecko/20070815 Firefox/2.0.0.6 Navigator/9.0b3',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:49.0) Gecko/20100101 Firefox/49.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.933.400 QQBrowser/9.4.8699.400',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Win64; x64; Trident/4.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; .NET4.0E; Tablet PC 2.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/48.0.2564.82 Chrome/48.0.2564.82 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36',
    'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)',
    'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.3 Mobile/14E277 Safari/603.1.30',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
]

# Pool of proxy endpoints, each written as "https://<ip>:<port>".
# NOTE(review): presumably consumed by the project's rotate_proxy
# ProxyMiddleWare, which is commented out in DOWNLOADER_MIDDLEWARES -- confirm
# whether this list is still used, and whether these hard-coded addresses are
# still reachable, before relying on it.
PROXY_IPS = [
    'https://183.30.197.19:9797', 'https://116.25.240.211:3128', 'https://112.95.88.50:9999',
    'https://121.23.194.123:9000', 'https://59.39.129.104:9000', 'https://27.46.35.151:9797',
    'https://123.112.23.41:53281', 'https://27.46.20.144:888', 'https://113.87.163.144:9000',
    'https://210.82.28.57:8080', 'https://183.172.58.22:8118', 'https://183.33.128.38:808',
    'https://116.30.120.88:9000', 'https://115.171.111.191:9000', 'https://113.77.240.164:9797',
    'https://101.93.200.150:9000', 'https://112.95.191.191:9999', 'https://163.125.70.20:9999',
    'https://14.155.19.1:9000', 'https://27.46.20.114:888', 'https://112.95.20.22:9999',
    'https://163.125.70.183:9999', 'https://125.123.143.143:9000', 'https://125.123.136.6:9000',
    'https://106.113.242.193:9999', 'https://113.116.145.183:9000', 'https://59.38.62.131:9797',
    'https://123.13.245.66:9999', 'https://113.87.160.103:9000', 'https://112.95.189.246:9999',
    'https://59.59.148.37:53281', 'https://60.168.87.165:8888', 'https://121.201.38.71:41599',
    'https://27.44.155.237:9999', 'https://221.217.48.189:9000', 'https://119.27.177.169:80',
    'https://125.123.136.251:9000', 'https://14.155.16.89:9000', 'https://27.44.173.202:9999',
    'https://222.209.189.127:53281', 'https://120.78.180.231:8118', 'https://27.44.173.24:9999',
    'https://14.155.113.95:9000', 'https://27.184.124.108:8118', 'https://58.244.54.247:8080',
    'https://163.125.17.38:8888', 'https://115.171.202.104:9000', 'https://14.20.235.133:9797',
    'https://222.186.45.149:63505', 'https://58.38.9.25:9000', 'https://27.44.171.134:9999',
    'https://119.139.197.101:3128', 'https://116.226.65.49:9000', 'https://119.29.67.149:8118',
    'https://119.123.241.213:9000', 'https://114.99.255.202:8118', 'https://117.65.38.180:63909',
    'https://60.179.248.175:6666', 'https://112.95.206.210:9999', 'https://112.95.206.195:8888',
    'https://119.29.252.90:3128', 'https://27.44.171.149:9999', 'https://220.198.96.243:9999',
    'https://106.113.242.114:9999', 'https://125.118.74.129:3128', 'https://1.196.160.121:9999',
    'https://60.208.140.150:53281', 'https://163.125.71.115:9999', 'https://222.186.15.182:51814',
    'https://113.116.127.46:9797', 'https://114.249.113.250:9000', 'https://171.37.29.139:9797',
    'https://219.131.242.45:9797', 'https://116.17.8.121:9999', 'https://183.30.197.24:9797',
    'https://219.137.64.165:5555', 'https://123.121.62.122:9000', 'https://111.194.245.14:9000',
    'https://113.116.146.17:9000', 'https://117.65.35.226:63909', 'https://221.217.52.58:9000',
    'https://116.30.121.94:9000', 'https://101.81.104.23:9000', 'https://163.125.71.78:8888',
    'https://14.20.235.186:808', 'https://120.55.164.140:8123', 'https://27.37.47.31:9000',
    'https://112.95.191.21:9797', 'https://115.46.64.33:8123', 'https://122.241.73.60:808',
    'https://175.155.136.84:1133', 'https://106.56.102.142:808', 'https://106.56.102.31:8070',
    'https://171.113.156.26:8010', 'https://1.197.59.95:61234', 'https://182.34.20.196:53128',
    'https://61.164.39.66:53281', 'https://114.225.170.243:53128', 'https://117.64.236.218:808',
    'https://175.175.218.112:1133', 'https://171.113.159.63:8010', 'https://106.56.102.148:8070',
    'https://110.85.89.8:45862', 'https://115.46.74.223:8123', 'https://123.161.154.213:41345',
    'https://171.38.78.221:8123', 'https://27.153.128.219:32559', 'https://106.56.102.26:808',
    'https://175.148.77.68:1133', 'https://115.46.73.125:8123', 'https://180.118.243.234:808',
    'https://114.225.170.124:53128', 'https://101.201.49.204:8080', 'https://112.85.72.122:53128',
    'https://171.38.79.235:8123', 'https://175.148.72.228:1133', 'https://106.56.102.109:808',
    'https://115.46.64.179:8123', 'https://115.46.96.132:8123', 'https://115.46.77.55:8123',
    'https://171.37.164.139:8123', 'https://36.33.25.26:808', 'https://114.234.80.208:53128',
    'https://36.22.198.172:8010', 'https://106.56.102.18:8070', 'https://221.227.14.128:8070',
    'https://110.72.41.166:8123', 'https://106.56.102.156:808', 'https://27.153.128.68:8010',
    'https://106.56.102.241:8070', 'https://113.57.34.234:808', 'https://115.151.1.39:808',
    'https://171.39.2.188:8123', 'https://112.194.234.67:53128', 'https://106.56.102.34:808',
    'https://123.53.132.161:8010', 'https://114.225.171.161:53128', 'https://222.95.22.53:53128',
    'https://123.180.69.229:8010', 'https://115.219.107.132:8010', 'https://125.109.193.33:8010',
    'https://182.240.39.104:8118', 'https://101.236.43.153:8866', 'https://59.173.75.114:8010',
    'https://106.56.102.51:8070', 'https://49.87.135.48:53128', 'https://106.56.102.135:8070',
    'https://115.46.75.41:8123', 'https://60.184.174.152:808', 'https://36.25.111.193:8010',
    'https://123.161.157.248:36869', 'https://123.180.69.223:8010', 'https://59.173.74.113:8010',
    'https://111.72.154.74:53128', 'https://115.46.89.221:8123', 'https://115.46.97.120:8123',
    'https://106.56.102.167:8070', 'https://27.16.163.60:8010', 'https://221.229.18.122:808',
    'https://171.39.28.131:8123', 'https://121.31.157.32:8123', 'https://114.225.168.114:53128',
    'https://114.223.166.119:8118', 'https://121.31.157.224:8123', 'https://115.223.118.23:8010',
    'https://121.31.155.194:8123', 'https://121.31.157.4:8123', 'https://121.69.13.242:53281',
    'https://58.54.220.91:26528', 'https://222.190.222.59:27623', 'https://123.162.192.59:38065',
    'https://117.86.15.66:44706', 'https://123.161.237.118:26383', 'https://60.184.108.54:25564',
    'https://27.31.103.79:28029', 'https://180.122.147.160:32506', 'https://114.230.216.195:28623',
    'https://49.89.85.235:37234', 'https://49.85.6.3:45041', 'https://27.31.102.201:36835',
    'https://123.160.225.9:29302', 'https://49.85.4.13:24316', 'https://171.14.234.99:42467',
    'https://221.227.249.225:33403', 'https://60.169.221.230:20234', 'https://115.46.75.208:8123'
]
